#!/bin/bash

# Small English Wikipedia dataset (~2M chunks).
# Sets the global knobs for the tiny Wikipedia run: index layout,
# sampling sizes, tiny LR schedule, eval cadence, and search params.
get_wiki_tiny_config() {
    # Retrieval index (small IVF list count for the ~2M-chunk corpus).
    RETRO_INDEX_STR="IVF4096_HNSW4,Flat"
    RETRO_NCHUNKS_SAMPLED="2281307"

    # GPT training / LR schedule (minimal values for a quick run).
    RETRO_GPT_TRAIN_SAMPLES="31250"
    LR_DECAY_SAMPLES="2"
    LR_WARMUP_SAMPLES="1"

    # Evaluation cadence.
    RETRO_GPT_EVAL_INTERVAL="2000"
    RETRO_GPT_EVAL_ITERS="100"

    # ANN search parameters and dataloader mode.
    RETRO_EF_SEARCH="4"
    RETRO_NPROBE="64"
    DATALOADER_TYPE="cyclic"
}

# English Wikipedia dataset (~67M chunks).
# Sets the global knobs for the full English Wikipedia run: larger index,
# full sampling sizes, eval cadence, and wider ANN search parameters.
get_wiki_config() {
    # Retrieval index (larger IVF list count for the ~67M-chunk corpus).
    RETRO_INDEX_STR="IVF262144_HNSW32,Flat"
    RETRO_NCHUNKS_SAMPLED="66625331"

    # GPT training / LR schedule.
    RETRO_GPT_TRAIN_SAMPLES="2037248"
    LR_DECAY_SAMPLES="2"
    LR_WARMUP_SAMPLES="1"

    # Evaluation cadence.
    RETRO_GPT_EVAL_INTERVAL="2000"
    RETRO_GPT_EVAL_ITERS="100"

    # ANN search parameters and dataloader mode.
    RETRO_EF_SEARCH="16"
    RETRO_NPROBE="4096"
    DATALOADER_TYPE="cyclic"
}

# Full corpus (~5B chunks).
# Sets the global knobs for the full-corpus run: product-quantized index,
# full training schedule with real warmup/decay, and widest search params.
get_corpus_config() {
    # Retrieval index (OPQ + IVF-HNSW + PQ for the ~5B-chunk corpus).
    RETRO_INDEX_STR="OPQ64_128,IVF4194304_HNSW32,PQ64"
    RETRO_NCHUNKS_SAMPLED="300000000"

    # GPT training / LR schedule.
    RETRO_GPT_TRAIN_SAMPLES="192000000"
    LR_DECAY_SAMPLES="166400000"
    LR_WARMUP_SAMPLES="162761"

    # Evaluation cadence.
    RETRO_GPT_EVAL_INTERVAL="2000"
    RETRO_GPT_EVAL_ITERS="50"

    # ANN search parameters and dataloader mode (non-cyclic here).
    RETRO_EF_SEARCH="32"
    RETRO_NPROBE="4096"
    DATALOADER_TYPE="single"
}
